%matplotlib inline 
#the magic above makes plots display inline in the notebook.
#python includes
import sys
#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized, works well with data
import matplotlib
import matplotlib.pyplot as plt #plot visualization
#Method to load data
def getConllTags(filename):
    """Load a CoNLL-style part-of-speech tagged file.

    input: filename of a UTF-8 text file holding one "word<TAB>tag" pair per
        line, with blank lines separating sentences
    output: a list of sentences, each a list of (word, tag) tuples, e.g.
        [[(word1, tag1), (word2, tag2)], ...]. Blank lines act purely as
        separators, so leading, trailing or repeated blank lines (and an
        empty file) never produce empty sentences.
    raises: ValueError if a non-blank line does not split into exactly two
        tab-separated fields.
    """
    wordTagsPerSent = []
    currentSent = []
    with open(filename, encoding='utf8') as f:
        for wordtag in f:
            wordtag = wordtag.strip()
            if wordtag:#still reading current sentence
                (word, tag) = wordtag.split("\t")
                currentSent.append((word, tag))
            elif currentSent:#blank line closes the sentence in progress
                wordTagsPerSent.append(currentSent)
                currentSent = []
    if currentSent:#file did not end with a blank line
        wordTagsPerSent.append(currentSent)
    return wordTagsPerSent
#read the tagged corpus and preview its first two sentences
corpus = 'daily547.conll'
taggedSents = getConllTags(corpus)
print(taggedSents[0:2])
from pprint import pprint
#running tallies, each keyed by a tuple of lowercased words:
wordCounts = dict()
bigramCounts = dict()
trigramCounts = dict()
numTrainingSents = 500
#walk the training sentences, tallying every unigram, bigram and trigram;
#n-grams never span a sentence boundary
for sent in taggedSents[:numTrainingSents]:
    words = [token.lower() for token, _ in sent] # keep the words, drop the tags
    for pos, current in enumerate(words):
        unigram = (current,)
        wordCounts[unigram] = wordCounts.get(unigram, 0) + 1

        #a bigram needs one preceding word
        if pos >= 1:
            bigram = (words[pos - 1], current)
            bigramCounts[bigram] = bigramCounts.get(bigram, 0) + 1

        #a trigram needs two preceding words
        if pos >= 2:
            trigram = (words[pos - 2], words[pos - 1], current)
            trigramCounts[trigram] = trigramCounts.get(trigram, 0) + 1
#show the 20 most frequent entries of each table, highest count first
for table in (wordCounts, bigramCounts, trigramCounts):
    pprint(sorted(table.items(), key=lambda kv: kv[1], reverse=True)[:20])
    
        
#specify the model (e.g. bigramCounts or trigramCounts)
ngramCounts = trigramCounts
#ngramCounts = bigramCounts
#The maximum-likelihood estimate is count(context + word) / count(context), so the
#denominator must come from the table one order below the chosen model:
#bigramCounts for a trigram model, wordCounts for a bigram model.
isTrigramModel = ngramCounts is trigramCounts
contextCounts = bigramCounts if isTrigramModel else wordCounts
ngramModelProbs = dict()# stores p(xi | context), indexed by [context][xi]
for ngram, count in ngramCounts.items():
    context, nextWord = ngram[:-1], ngram[-1]
    p = count / contextCounts[context]
    ngramModelProbs.setdefault(context, {})[nextWord] = p
#show probabilities for all words that could follow an example context whose
#length matches the chosen model ("i love" for trigrams, "i" for bigrams);
#an unseen context prints an empty list instead of raising KeyError
exampleContext = ('i', 'love') if isTrigramModel else ('i',)
pprint(sorted(ngramModelProbs.get(exampleContext, {}).items()))
#TODO: if time permits, generate a sentence
#saved code in case we want to do one-hot representation
#collect the distinct words and distinct tags over all sentences
vocabulary = set()
tagInventory = set()
for sent in taggedSents:
    if sent: #skip empty sentences: zip(*[]) yields nothing to unpack
        words, tags = zip(*sent)
        vocabulary |= set(words) #union of the words into the set
        tagInventory |= set(tags) #union of all the tags into the set
print("[Read ", len(taggedSents), " Sentences]")
#make dictionaries for converting words to index and tags to ids
#(sorted first, because set iteration order is not stable across runs)
wordToIndex = {w: i for i, w in enumerate(sorted(vocabulary))}
numToTag = sorted(tagInventory) #mapping index to tag
tagToNum = {tag: i for i, tag in enumerate(numToTag)}